#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@author: leochen
@contact: xaoyaoyao@aliyun.com
@file: bufanbiz_spider.py
@time: 2018/08/26
"""
import re
import json
from scrapy.http.response.html import HtmlResponse
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Module-level Selenium setup: launches a headless Chrome instance at import
# time with a hard-coded Windows chromedriver path.
# NOTE(review): `browser` is never referenced by the functions below — the
# actual parsing reads a local HTML file via BeautifulSoup. Presumably this
# was used for a fetch step that has been removed; confirm before deleting.
# NOTE(review): the `chrome_options=` keyword is deprecated in newer Selenium
# releases in favor of `options=` — verify against the pinned Selenium version.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
browser = webdriver.Chrome(chrome_options=chrome_options, executable_path=r"D:\projects\tools\chromedriver.exe")
browser.set_page_load_timeout(45)  # fail page loads that exceed 45 seconds
browser.maximize_window()

def parse(url):
    """Extract category links from a locally saved HTML page.

    Parses the file at *url* (a local file path, despite the name),
    selects every ``a.cate_detail_tit_lk`` link under ``#J_popCtn``,
    and prints a ``'name':'https:<href>',`` line for each link whose
    href contains a ``cat=`` query parameter.

    :param url: path to the saved HTML file to parse.
    :return: list of ``{'name': ..., 'url': ...}`` dicts, one per link.
    """
    soup = _load_file(url)
    links = soup.select('#J_popCtn a[class="cate_detail_tit_lk"]')
    # Bug fix: the original reused a single dict, overwriting it on every
    # iteration so only the last link survived and nothing was returned.
    items = []
    for link in links:
        href = link.attrs['href']
        items.append({'name': link.text.strip(), 'url': href})
        if 'cat=' in href:
            print("'%s':'https:%s'," % (link.text, href))
    return items

def _load_file(path):
    """Read the HTML file at *path* and return it as a BeautifulSoup tree.

    The file is decoded as UTF-8 with undecodable bytes ignored, then
    parsed with the ``html5lib`` parser.
    """
    with open(path, 'r', encoding='utf-8', errors='ignore') as handle:
        return BeautifulSoup(handle, "html5lib")


if __name__ == "__main__":
    # Guard the script entry point so importing this module does not
    # immediately parse the hard-coded local file.
    parse('d:/a.html')

